01 - reanalysis¶

We took the original FASTQ files from GSE200534 and preprocessed them with the nf-core/marsseq pipeline.

nextflow run ...
In [1]:
%matplotlib inline
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

import glob
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

from natsort import natsorted
from matplotlib import rcParams

random_seed = 12345
np.random.seed(random_seed)
sc.settings.set_figure_params(dpi=80)
sc.settings.verbosity = 3
In [2]:
def ignore_n_rows(csv_file):
    """Return the number of leading lines to skip before the FACS header.

    Scans ``csv_file`` for the first line beginning with "Well" (the
    column-header row of the exported FACS table) and returns its
    zero-based position, suitable for ``pd.read_csv(..., skiprows=...)``.
    Falls back to 0 when no such line is present.
    """
    with open(csv_file) as handle:
        header_row = next(
            (row for row, text in enumerate(handle) if text.startswith("Well")),
            0,
        )
    return header_row


def load_metadata():
    """Load the plate design spreadsheet and merge per-well FACS measurements.

    For every index-sort CSV referenced in the design file, the exported
    FACS table is read (skipping the instrument preamble), reduced to the
    five geometric-mean channels of interest, and left-joined onto the
    design rows of the matching plate via the well coordinates.

    Returns
    -------
    pd.DataFrame
        One row per well, naturally sorted by index, with the design
        columns followed by the FACS intensity columns.
    """
    # FACS channels kept from each index-sort export; defined once and
    # reused below so the selection and the final column order stay in sync.
    facs_columns = [
        "All.Events.FSC-A.Geo.Mean",
        "All.Events.SSC-A.Geo.Mean",
        "All.Events.GFP-A.Geo.Mean",
        "All.Events.mCherry.561D-A.Geo.Mean",
        "All.Events.DAPI-A.Geo.Mean",
    ]

    metadata = pd.read_excel(
        "../data/raw/Design file_Sara_Corrected_plusColumn.xlsx",
        usecols=range(1, 20),
        index_col=0,
    )

    facs_data = []
    for csv_file in metadata["Index sort"].unique():
        print(f"Processing {csv_file}")
        file: str = f"../data/raw/facs-data/{csv_file}.csv"
        # The instrument export starts with a preamble; the real header row
        # begins with "Well".
        skip_rows = ignore_n_rows(file)
        csv_content = pd.read_csv(file, skiprows=skip_rows, index_col=0, thousands=",")
        # Normalize column names ("FSC-A Geo Mean" -> "FSC-A.Geo.Mean").
        csv_content.columns = csv_content.columns.str.strip().str.replace(" ", ".")
        csv_content = csv_content[facs_columns].astype(float)

        # Attach the FACS values to the design rows of this plate only.
        design_sub = metadata[metadata["Index sort"] == csv_file]
        design_merged = design_sub.merge(
            csv_content, left_on="Well_coordinates", right_index=True, how="left"
        )
        facs_data.append(design_merged)
    facs_data = pd.concat(facs_data, sort=False)

    # Restore natural well order (A1, A2, ..., not A1, A10, A11, ...).
    facs_data = facs_data.loc[natsorted(facs_data.index)]

    # Fixed column order: design metadata first, FACS channels last.
    facs_data = facs_data[
        [
            "Well_coordinates",
            "plate_ID",
            "Amp_batch_ID",
            "Cell_barcode",
            "Pool_barcode",
            "Batch",
            "Condition",
            "SampleName",
            "Stage",
            "Source",
            "Day",
            "Index sort",
        ]
        + facs_columns
    ]

    return facs_data

Create object¶

In [3]:
# UMI count tables produced by the preprocessing pipeline, one file per
# amplification batch directory; natsorted keeps the plate/batch order stable.
sb_1 = natsorted(glob.glob('../data/raw/SB1/umi.tab/*'))
sb_2 = natsorted(glob.glob('../data/raw/SB2/umi.tab/*'))

batch_files = np.concatenate([sb_1, sb_2])
# Concatenate the per-batch tables column-wise (axis=1): rows are genes,
# columns are cells.
counts = pd.concat([pd.read_table(x) for x in batch_files], axis=1)
In [4]:
facs_content = load_metadata()
Processing Plate 1A_2iLIF
Processing Plate 2A_D0_2iLIF
Processing Plate 2A_D0_rest
Processing Plate 3B_D1_2iLIF
Processing Plate 3B_D1_rest
Processing Plate 4A_D2_2iLIF
Processing Plate 4A_D2_rest
Processing Plate 5A_D3_2iLIF
Processing Plate 5A_D3_rest
Processing Plate 6A_D4_2iLIF
Processing Plate 6A_D4_rest
Processing Plate 7A_D7_2iLIF
Processing Plate 7A_D7_rest
In [5]:
# Build the AnnData object: cells x genes (hence counts.T), with the merged
# design/FACS table as per-cell annotations.
adata = sc.AnnData(X=counts.T, obs=facs_content)
adata.obs['plate_ID'] = adata.obs['plate_ID'].astype('category')
adata
Out[5]:
AnnData object with n_obs × n_vars = 2496 × 53638
    obs: 'Well_coordinates', 'plate_ID', 'Amp_batch_ID', 'Cell_barcode', 'Pool_barcode', 'Batch', 'Condition', 'SampleName', 'Stage', 'Source', 'Day', 'Index sort', 'All.Events.FSC-A.Geo.Mean', 'All.Events.SSC-A.Geo.Mean', 'All.Events.GFP-A.Geo.Mean', 'All.Events.mCherry.561D-A.Geo.Mean', 'All.Events.DAPI-A.Geo.Mean'

FACS¶

In [6]:
fig, ax = plt.subplots(1, 2, figsize=[20, 6], sharey=True, sharex=True)
sns.kdeplot(x='All.Events.mCherry.561D-A.Geo.Mean', y='All.Events.GFP-A.Geo.Mean', hue='Stage', data=adata.obs, ax=ax[0])
sns.scatterplot(x='All.Events.mCherry.561D-A.Geo.Mean', y='All.Events.GFP-A.Geo.Mean', hue='Stage', data=adata.obs, ax=ax[1])

plt.xlim(-2_000, 5_000)
plt.ylim(-1_000, 4_200)
Out[6]:
(-1000.0, 4200.0)
In [7]:
rcParams['figure.figsize'] = 10, 5
sns.kdeplot(x='All.Events.DAPI-A.Geo.Mean', hue='plate_ID', data=adata.obs)
plt.axvline(x=320, color='r')
Out[7]:
<matplotlib.lines.Line2D at 0x7f05c72a5dc0>
In [8]:
adata.obs[['plate_ID', 'All.Events.DAPI-A.Geo.Mean']].groupby('plate_ID').describe()
Out[8]:
All.Events.DAPI-A.Geo.Mean
count mean std min 25% 50% 75% max
plate_ID
1 380.0 312.076316 129.469802 74.0 211.75 282.5 401.0 653.0
2 373.0 257.021448 126.962295 67.0 170.00 226.0 314.0 630.0
3 143.0 199.748252 45.764933 8.0 168.50 201.0 233.5 271.0
4 379.0 164.749340 45.180566 2.0 133.00 169.0 202.5 236.0
5 377.0 150.339523 37.621242 24.0 121.00 153.0 182.0 214.0
6 378.0 159.735450 51.429578 5.0 117.00 161.0 203.0 260.0
7 380.0 176.439474 53.891458 17.0 141.75 180.5 221.0 259.0

Filtering¶

In [9]:
# adata = adata[adata.obs.plate_ID != 1]
# DAPI cutoff of 320 taken from the density plot above (red line);
# presumably removes non-viable DAPI-high cells — confirm gating intent.
adata = adata[adata.obs['All.Events.DAPI-A.Geo.Mean'] <= 320.0]
# Also drop wells annotated as the 'Zero' stage.
adata = adata[adata.obs.Stage != 'Zero']
In [10]:
adata.obs[['plate_ID', 'All.Events.DAPI-A.Geo.Mean']].groupby('plate_ID').describe()
Out[10]:
All.Events.DAPI-A.Geo.Mean
count mean std min 25% 50% 75% max
plate_ID
1 227.0 222.933921 54.505338 74.0 182.50 226.0 267.50 320.0
2 286.0 198.164336 62.649436 67.0 150.25 191.5 246.75 320.0
3 143.0 199.748252 45.764933 8.0 168.50 201.0 233.50 271.0
4 379.0 164.749340 45.180566 2.0 133.00 169.0 202.50 236.0
5 377.0 150.339523 37.621242 24.0 121.00 153.0 182.00 214.0
6 378.0 159.735450 51.429578 5.0 117.00 161.0 203.00 260.0
7 380.0 176.439474 53.891458 17.0 141.75 180.5 221.00 259.0
In [11]:
adata.shape
Out[11]:
(2170, 53638)
In [12]:
# Flag gene categories for QC: mitochondrial genes, ERCC spike-ins, and
# ribosomal protein genes (mouse naming conventions).
adata.var['mt'] = adata.var_names.str.startswith('mt-')
adata.var['ercc'] = adata.var_names.str.startswith('ERCC-')
adata.var['ribo'] = adata.var_names.str.startswith(('Rpl', 'Rps'))

# Adds n_genes_by_counts / total_counts per cell and pct_counts_* per category.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ercc', 'ribo'], percent_top=None, log1p=False, inplace=True)
Trying to set attribute `.var` of view, copying.
In [13]:
fig, ax = plt.subplots(2, 2, figsize=(20, 10))
sns.violinplot(y=adata.obs['n_genes_by_counts'], orient='v', ax=ax[0][0])
sns.violinplot(y=adata.obs['total_counts'], orient='v', ax=ax[0][1])
sns.violinplot(y=adata.obs['pct_counts_mt'], orient='v', ax=ax[1][0])
sns.violinplot(y=adata.obs['pct_counts_ercc'], orient='v', ax=ax[1][1])
Out[13]:
<AxesSubplot:ylabel='pct_counts_ercc'>
In [14]:
fig, ax = plt.subplots(1,3, figsize=(20, 5), sharey=True)
sns.scatterplot(x='total_counts', y='n_genes_by_counts', ax=ax[0], data=adata.obs, hue='Stage')
sns.scatterplot(x='total_counts', y='n_genes_by_counts', ax=ax[1], hue='pct_counts_mt', data=adata.obs)
sns.scatterplot(x='total_counts', y='n_genes_by_counts', ax=ax[2], hue='pct_counts_ribo', data=adata.obs)
Out[14]:
<AxesSubplot:xlabel='total_counts', ylabel='n_genes_by_counts'>
In [15]:
min_cells, max_cells = 1_400, 15_000
min_genes, max_genes = 800, 4_500

p = sns.jointplot(x='total_counts', y='n_genes_by_counts', data=adata.obs, marginal_kws=dict(bins=100), height=10)
# min
p.ax_joint.axvline(x=min_cells, color='red')
p.ax_joint.axhline(y=min_genes, color='red')

# max
p.ax_joint.axvline(x=max_cells, color='green')
p.ax_joint.axhline(y=max_genes, color='green')
Out[15]:
<matplotlib.lines.Line2D at 0x7f05c682d880>
In [16]:
print(f'Before filtering: {adata.shape}')

# Drop ERCC spike-in features (these are genes/columns, not cells).
adata = adata[:, ~adata.var_names.str.startswith('ERCC-')]

# Cell-level QC using the thresholds set in the jointplot cell above.
# Note: 'min_cells'/'max_cells' are per-cell UMI-count bounds despite
# their names.
sc.pp.filter_cells(adata, min_genes=min_genes)
sc.pp.filter_cells(adata, min_counts=min_cells)
sc.pp.filter_cells(adata, max_counts=max_cells)
sc.pp.filter_cells(adata, max_genes=max_genes)

# Gene-level QC: keep genes detected in at least 3 cells.
sc.pp.filter_genes(adata, min_cells=3)
print(f'After filtering: {adata.shape}')
Before filtering: (2170, 53638)
filtered out 100 cells that have less than 800 genes expressed
Trying to set attribute `.obs` of view, copying.
filtered out 6 cells that have less than 1400 counts
filtered out 5 cells that have more than 15000 counts
filtered out 4 cells that have more than 4500 genes expressed
filtered out 33087 genes that are detected in less than 3 cells
After filtering: (2055, 20459)
In [17]:
fig, ax = plt.subplots(1,5, figsize=(25, 5))
sns.violinplot(y=adata.obs['n_genes_by_counts'], orient='v', ax=ax[0])
sns.violinplot(y=adata.obs['total_counts'], orient='v', ax=ax[1])
sns.violinplot(y=adata.obs['pct_counts_mt'], orient='v', ax=ax[2])
sns.violinplot(y=adata.obs['pct_counts_ercc'], orient='v', ax=ax[3])
sns.violinplot(y=adata.obs['pct_counts_ribo'], orient='v', ax=ax[4])
Out[17]:
<AxesSubplot:ylabel='pct_counts_ribo'>
In [18]:
adata.layers['raw'] = adata.X.copy()

Normalize¶

In [19]:
# Library-size normalize every cell to 10k counts, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=10_000)
sc.pp.log1p(adata)
# Freeze the normalized (pre-scaling) matrix in .raw for plotting/DE later.
adata.raw = adata
normalizing counts per cell
    finished (0:00:00)
In [20]:
sc.pp.highly_variable_genes(adata, n_top_genes=2_000)
sc.pl.highly_variable_genes(adata)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [21]:
sc.pp.scale(adata, max_value=5)
In [22]:
# Macosko et al. cell-cycle gene sets (human symbols, tab-separated).
# NOTE: fetched from the network on every run — consider caching locally.
cc_genes = pd.read_table("https://raw.githubusercontent.com/theislab/single-cell-tutorial/master/Macosko_cell_cycle_genes.txt", delimiter='\t')
s_genes = cc_genes['S'].dropna()
g2m_genes = cc_genes['G2.M'].dropna()

# Convert human ALL-CAPS symbols to mouse casing (e.g. "MCM5" -> "Mcm5").
s_genes_mm = [gene.lower().capitalize() for gene in s_genes]
g2m_genes_mm = [gene.lower().capitalize() for gene in g2m_genes]

# Restrict to genes present in the filtered matrix.
# np.isin replaces np.in1d, which is deprecated since NumPy 1.25.
s_genes_hvg = adata.var_names[np.isin(adata.var_names, s_genes_mm)]
g2m_genes_hvg = adata.var_names[np.isin(adata.var_names, g2m_genes_mm)]

# Adds S_score, G2M_score and the categorical 'phase' to adata.obs.
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes_hvg, g2m_genes=g2m_genes_hvg)
calculating cell cycle phase
computing score 'S_score'
    finished: added
    'S_score', score of gene set (adata.obs).
    1765 total control genes are used. (0:00:00)
computing score 'G2M_score'
    finished: added
    'G2M_score', score of gene set (adata.obs).
    1772 total control genes are used. (0:00:00)
-->     'phase', cell cycle phase (adata.obs)
In [23]:
adata.obs.phase.value_counts()
Out[23]:
G2M    1349
S       515
G1      191
Name: phase, dtype: int64
In [24]:
sc.tl.pca(adata, svd_solver='arpack', random_state=random_seed)
sc.pl.pca_variance_ratio(adata, log=True)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
In [25]:
sc.pl.pca(adata, color=["Batch", "Stage"])
... storing 'Well_coordinates' as categorical
... storing 'Amp_batch_ID' as categorical
... storing 'Cell_barcode' as categorical
... storing 'Pool_barcode' as categorical
... storing 'Batch' as categorical
... storing 'Condition' as categorical
... storing 'SampleName' as categorical
... storing 'Stage' as categorical
... storing 'Source' as categorical
... storing 'Day' as categorical
... storing 'Index sort' as categorical
... storing 'phase' as categorical
In [26]:
# kNN graph on the PCA space, then the UMAP embedding and two alternative
# clusterings (Louvain and Leiden) for comparison; all seeded for
# reproducibility.
sc.pp.neighbors(adata, random_state=random_seed)
sc.tl.umap(adata, random_state=random_seed)
sc.tl.louvain(adata, random_state=random_seed)
sc.tl.leiden(adata, random_state=random_seed)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:05)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:04)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 8 clusters and added
    'louvain', the cluster labels (adata.obs, categorical) (0:00:00)
running Leiden clustering
    finished: found 9 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
In [27]:
rcParams['figure.figsize'] = 7, 7
sc.pl.umap(adata, color=['Batch', 'Stage', 'louvain', 'leiden'], ncols=2, frameon=False)
In [28]:
# Remove Leiden cluster 8 (the old comment said 7, but the code filters '8').
adata = adata[adata.obs.leiden != '8']
In [29]:
rcParams['figure.figsize'] = 7, 7
sc.pl.umap(adata, color=['Batch', 'Stage', 'louvain', 'leiden'], ncols=2, frameon=False)
In [30]:
sc.pl.umap(adata, color=['Pou5f1', 'Nanog', 'Gata6', 'Sox17'], ncols=2, frameon=False)
In [31]:
sc.pl.umap(adata, color=['All.Events.GFP-A.Geo.Mean', 
                         'All.Events.mCherry.561D-A.Geo.Mean',
                         'All.Events.DAPI-A.Geo.Mean'])
In [32]:
sc.tl.embedding_density(adata, basis='umap', groupby='Stage')
sc.pl.embedding_density(adata, basis='umap', key='umap_density_Stage', ncols=3)
computing density on 'umap'
Trying to set attribute `.obs` of view, copying.
--> added
    'umap_density_Stage', densities (adata.obs)
    'umap_density_Stage_params', parameter (adata.uns)

Markers¶

In [33]:
sc.tl.rank_genes_groups(adata, groupby='leiden')
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var'
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:03)
In [34]:
sc.tl.dendrogram(adata, groupby="leiden")
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10)
    using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_leiden']`
WARNING: Gene labels are not shown when more than 50 genes are visualized. To show gene labels set `show_gene_labels=True`
In [35]:
# Collect the top 25 marker genes per Leiden cluster
# (log2FC >= 1, adjusted p < 0.05), ranked by fold change.
markers = []
# natsorted (not np.sort) so string labels order naturally
# ('2' before '10') once there are ten or more clusters.
for cluster in natsorted(adata.obs.leiden.unique()):
    df = sc.get.rank_genes_groups_df(adata, group=cluster, log2fc_min=1, pval_cutoff=0.05)
    df = df.sort_values(by='logfoldchanges', ascending=False).head(25)
    df['cluster'] = cluster
    markers.append(df)
markers = pd.concat(markers)
# NOTE(review): genes are ranked by 'leiden' (cell above) but the file is
# named "louvain" — confirm which clustering these markers belong to.
markers.to_csv("../results/01_louvain_markers.csv")

Save session¶

In [36]:
adata.write("../data/processed/01_dataset.h5ad")
In [ ]:
!jupyter nbconvert --to html 01_analysis.ipynb --output-dir="../reports"

Velocity¶

In [ ]:
import scvelo as scv

scv.settings.verbosity = 3
scv.settings.presenter_view = True
scv.settings.set_figure_params('scvelo')
In [ ]:
adata = sc.read("../data/processed/11_fulldataset.h5ad")

# adata = adata[adata.obs.Stage != '2iLIF_PrE']
# Drop Louvain clusters 2 and 5 before the velocity analysis.
adata = adata[~adata.obs.louvain.isin(['2', '5'])]

# Restore raw UMI counts (saved in the 'raw' layer earlier); velocity
# estimation works on unnormalized counts.
adata.X = adata.layers['raw']
In [ ]:
# Reconstruct the full cell barcode (pool + cell) to match STARsolo barcodes.
adata.obs['barcodes'] = adata.obs['Pool_barcode'].str.strip() + adata.obs['Cell_barcode'].str.strip()

# NOTE(review): buildAnndataFromStar is not defined anywhere in this
# notebook — presumably it comes from a helper module or a deleted cell;
# confirm it is imported so a Restart-&-Run-All succeeds.
sb1 = buildAnndataFromStar("../data/raw/velocity/SB1/Solo.out/")
sb2 = buildAnndataFromStar("../data/raw/velocity/SB2/Solo.out/")

# Merge velocity
# Re-index each batch's annotations by barcode so they can be aligned with
# the STARsolo objects (the original obs index becomes a regular column).
sb1_obs = adata.obs[adata.obs.Batch == 'SB1'].reset_index().set_index('barcodes')
sb2_obs = adata.obs[adata.obs.Batch == 'SB2'].reset_index().set_index('barcodes')

# Keep only cells present in both the filtered AnnData and the STARsolo output.
common_cells_sb1 = np.intersect1d(sb1_obs.index, sb1.obs_names)
common_cells_sb2 = np.intersect1d(sb2_obs.index, sb2.obs_names)

sb1 = sb1[common_cells_sb1]
sb2 = sb2[common_cells_sb2]

# Transfer the per-cell annotations onto the velocity objects.
sb1.obs = sb1_obs.loc[common_cells_sb1]
sb2.obs = sb2_obs.loc[common_cells_sb2]

# merge
merged = sb1.concatenate(sb2, index_unique='')
# Restore the original per-well index (column produced by reset_index above).
merged.obs = merged.obs.set_index('Well_ID')
In [ ]:
adata_velocity = scv.utils.merge(adata, merged)
In [ ]:
adata_velocity.shape
In [ ]:
scv.pl.proportions(adata_velocity, groupby='louvain')
In [ ]:
adata_velocity.uns['iroot'] = np.flatnonzero(adata_velocity.obs['louvain'] == '4')[0]
In [ ]:
adata_velocity = run_scvelo(adata_velocity)
In [ ]:
rcParams['figure.figsize'] = [12, 12]
scv.pl.velocity_embedding_stream(adata_velocity, basis='umap', color=['louvain', 'Stage'])
In [ ]:
rcParams['figure.figsize'] = [6, 6]
scv.pl.velocity_embedding_stream(adata_velocity, basis='umap', color=['Stage'])
In [ ]:
rcParams['figure.figsize'] = [12, 12]
scv.pl.velocity_embedding_grid(adata_velocity, basis='umap', color=['louvain', 'Stage'], arrow_length=3, legend_loc = 'right')
In [ ]:
scv.tl.latent_time(adata_velocity, root_key='iroot')
In [ ]:
rcParams['figure.figsize'] = [12, 12]
scv.pl.scatter(adata_velocity, color=['leiden', 'latent_time'], color_map='gnuplot', size=80, colorbar=True)
In [ ]:
scv.tl.terminal_states(adata_velocity)
scv.pl.scatter(adata_velocity, color=['root_cells', 'end_points'])
In [ ]:
scv.tl.velocity_confidence(adata_velocity)
keys = 'velocity_length', 'velocity_confidence'
scv.pl.scatter(adata_velocity, c=keys, cmap='coolwarm', perc=[5, 95])
In [ ]:
rcParams['figure.figsize'] = [6, 6]
scv.pl.velocity_graph(adata_velocity, threshold=.1)
In [ ]:
scv.tl.velocity_pseudotime(adata_velocity)
scv.pl.scatter(adata_velocity, color='velocity_pseudotime', cmap='gnuplot')
In [ ]:
adata_velocity.uns['neighbors']['distances'] = adata_velocity.obsp['distances']
adata_velocity.uns['neighbors']['connectivities'] = adata_velocity.obsp['connectivities']

scv.tl.paga(adata_velocity, groups='louvain')
df = scv.get_df(adata_velocity, 'paga/transitions_confidence', precision=2).T
df.style.background_gradient(cmap='Blues').format('{:.2g}')
In [ ]:
scv.pl.paga(adata_velocity, basis='umap', size=50, alpha=.1, min_edge_width=2, node_size_scale=1.5)

Extra¶

In [ ]:
top_genes = adata_velocity.var['fit_likelihood'].sort_values(ascending=False).index
scv.pl.scatter(adata_velocity, basis=top_genes[:15], ncols=5, frameon=False)
In [ ]:
# Rank genes by dynamical-model likelihood per cluster. This notebook's
# clusterings live in .obs['louvain']/.obs['leiden'] — there is no
# 'clusters' key (the previous groupby='clusters' would KeyError) — and the
# cell below indexes the resulting frame by louvain labels.
scv.tl.rank_dynamical_genes(adata_velocity, groupby='louvain')
df = scv.get_df(adata_velocity, 'rank_dynamical_genes/names')
df.head(20)
In [ ]:
for cluster in adata_velocity.obs.louvain.unique():
    scv.pl.scatter(adata_velocity, df[cluster][:5], ylabel=cluster, frameon=False)
In [ ]:
scv.pl.velocity(adata_velocity, ['Col4a1', 'Col4a2'], ncols=1, figsize=[6, 8])

Save session¶

In [ ]:
adata_velocity.write('../data/processed/11_fulldataset_velo.h5ad')
In [ ]: